library(Seurat)
library(ggplot2)
library(grid)
library("GSEABase")
library(stringr)
library(dplyr)
library(Matrix)
library(parallel)
library(SiPSiC)

# Filtering thresholds shared by the functions below.

# Smallest expected cluster size, given as a percentage of all cells
minimalClusterSize <- 10

# Constant subtracted after exponentiating log-transformed expression values
logScalingConstant <- 1

# A cell must express at least this many genes to be kept
minNumOfGenesExpressed <- 1000

# Returns the meta-module label of a glioblastoma cell, chosen by the quadrant in which
# the cell's (X, Y) coordinates fall:
#   X > 0, Y > 0 -> "NPC-Like"      X < 0, Y > 0 -> "OPC-Like"
#   X < 0, Y < 0 -> "AC-Like"       X > 0, Y < 0 -> "MES-Like"
#
# cellCoordinates - a named vector or list holding the elements "X" and "Y"; the values
#                   may be numeric or numeric strings (as produced when apply() walks
#                   over the rows of a data frame).
#
# Stops with an error when a coordinate is missing / not numeric, or when the cell lies
# exactly on an axis, since such a cell belongs to no quadrant. Failing here keeps the
# caller from silently receiving NULL, which would turn the result of apply() into a list.
getMetaModule <- function(cellCoordinates)
{
  X <- as.numeric(cellCoordinates[["X"]])
  Y <- as.numeric(cellCoordinates[["Y"]])
  
  if (is.na(X) || is.na(Y))
  {
    stop("Cannot assign a meta module: the X or Y coordinate is missing or not numeric",
         call. = FALSE)
  }
  
  if ((X == 0) || (Y == 0))
  {
    stop("Cannot assign a meta module: the cell lies on an axis (X = ", X, ", Y = ", Y, ")",
         call. = FALSE)
  }
  
  if (Y > 0)
  {
    if (X > 0) "NPC-Like" else "OPC-Like"
  }
  else
  {
    if (X > 0) "MES-Like" else "AC-Like"
  }
}


# Filters an expression matrix (genes in rows, cells in columns) and optionally rescales it.
#
# Steps, in order:
#   1. If isLogTPM is TRUE, the values are transformed back with 2^x - logOffset.
#   2. Cells expressing (non-zero value) fewer than minGenesPerCell genes are removed.
#   3. Genes expressed in no more than minClusterPercent percent of the remaining cells
#      are removed.
#   4. If convertToCPM is TRUE, every remaining cell is rescaled so its values sum to 1e6.
#
# dataMatrix        - numeric matrix, genes x cells.
# isLogTPM          - TRUE when dataMatrix holds log2-transformed values.
# convertToCPM      - TRUE to rescale each cell to counts per million.
# logOffset         - constant subtracted after exponentiation (default: logScalingConstant).
# minGenesPerCell   - minimal number of expressed genes per cell (default: minNumOfGenesExpressed).
# minClusterPercent - minimal expected cluster size, in percent of cells (default: minimalClusterSize).
#
# Returns the filtered matrix; it stays a matrix even when a single cell or gene survives.
filterData <- function(dataMatrix, isLogTPM, convertToCPM,
                       logOffset = logScalingConstant,
                       minGenesPerCell = minNumOfGenesExpressed,
                       minClusterPercent = minimalClusterSize)
{
  filteredDataMatrix <- dataMatrix
  
  if (isLogTPM == TRUE)
  {
    filteredDataMatrix <- 2^(filteredDataMatrix) - logOffset
  }
  
  # Filtering out cells which express less than the minimal number of genes.
  # drop = FALSE keeps the matrix shape when only one cell passes the filter.
  expressedGenesCounters <- colSums(filteredDataMatrix != 0)
  cellsToKeep <- expressedGenesCounters >= minGenesPerCell
  filteredDataMatrix <- filteredDataMatrix[, cellsToKeep, drop = FALSE]
  
  # Filtering out genes which are expressed by less than the minimal expected cluster size of cells
  nonZeroCellCountsForGenes <- rowSums(filteredDataMatrix != 0)
  minNumOfCellsInClust <- ncol(filteredDataMatrix) * (minClusterPercent / 100)
  genesToKeep <- nonZeroCellCountsForGenes > minNumOfCellsInClust
  filteredDataMatrix <- filteredDataMatrix[genesToKeep, , drop = FALSE]
  
  # Converting the transcript counts to CPM
  if (convertToCPM == TRUE)
  {
    countSumsOfCells <- colSums(filteredDataMatrix)
    # After transposing, cells are the rows, so dividing by the per-cell sums recycles
    # the vector down the columns and lines each sum up with its own cell.
    filteredDataMatrix <- t(t(filteredDataMatrix) / countSumsOfCells) * 1000000
  }
  
  return (filteredDataMatrix)
}

# Scores one pathway across all cells, compares the scores between the four meta modules,
# stores the results in the global accumulators and saves a violin plot of the scores.
#
# inputPathway   - gene set object; its geneIds and setName slots are read.
# dataMatrix     - expression matrix handed to getPathwayScores().
# allCellsScores - data frame holding the columns NAME (cell name) and MetaModule.
#
# Side effects (all through <<- on variables that must already exist in an enclosing
# environment, so the function has to run in the same R process as its caller):
#   allPathwayScores                      gains a row named after the pathway
#   all_<A>_vs_<B>                        gain the unadjusted pairwise t-test P value
#   allEffectSize_<A>_vs_<B>              gain the difference of the group medians
#   allEffectSizes                        gains max(median) - min(median)
# A file "NormalizedCounts_<pathway>_allPatientsCells.pdf" is written to the working directory.
#
# Returns (invisibly) a list with the pathway name, the P values, the pairwise effect sizes
# and the total effect size, or NULL when the pathway scores could not be calculated.
executePathwayCalculations <- function(inputPathway, dataMatrix, allCellsScores)
{
  pathwayGenes <- inputPathway@geneIds
  pathwayName <- inputPathway@setName
  
  # A pathway whose scores cannot be calculated is skipped with a warning, instead of
  # failing later on with an unrelated error about the "$" operator.
  pathwayScores <- tryCatch(
    getPathwayScores(dataMatrix, pathwayGenes),
    error = function(e)
    {
      warning("Skipping pathway ", pathwayName, ": ", conditionMessage(e), call. = FALSE)
      NULL
    })
  if (is.null(pathwayScores)) return (invisible(NULL))
  
  # The score column is named explicitly rather than relying on the deparsed expression
  # that as.data.frame() would otherwise use as the column name.
  scoresAsDataFrame <- as.data.frame(pathwayScores$pathwayScore)
  names(scoresAsDataFrame)[1] <- "Score"
  scoresAsDataFrame$cellName <- rownames(scoresAsDataFrame)
  
  allCellsScores <- merge(x = allCellsScores, y = scoresAsDataFrame,
                          by.x = "NAME", by.y = "cellName", all = FALSE)
  
  # Keeping the pathway scores of the different cells. They are taken BEFORE the data is
  # sorted below, so every pathway contributes its scores in the same (merge) cell order
  # and each score stays attached to the correct cell name in allPathwayScores.
  currPathwayScores <- allCellsScores[,"Score"]
  names(currPathwayScores) <- allCellsScores[,"NAME"]
  
  allPathwayScores <<- rbind(allPathwayScores, currPathwayScores)
  rownames(allPathwayScores)[nrow(allPathwayScores)] <<- pathwayName
  
  allCellsScores <- allCellsScores[order(allCellsScores$MetaModule, allCellsScores$Score),]
  allCellsScores$MetaModule <- as.factor(allCellsScores$MetaModule)
  
  # Performing the T test using pairs of meta modules. In the P value matrix the rows are
  # the 2nd..last factor levels and the columns are the 1st..next-to-last levels.
  T.TestResult <- pairwise.t.test(allCellsScores$Score, allCellsScores$MetaModule, p.adjust.method="none")
  pValues <- T.TestResult$p.value
  
  Curr_MES_vs_AC <- pValues["MES-Like", "AC-Like"]
  Curr_NPC_vs_AC <- pValues["NPC-Like", "AC-Like"]
  Curr_OPC_vs_AC <- pValues["OPC-Like", "AC-Like"]
  Curr_NPC_vs_MES <- pValues["NPC-Like", "MES-Like"]
  Curr_OPC_vs_MES <- pValues["OPC-Like", "MES-Like"]
  Curr_OPC_vs_NPC <- pValues["OPC-Like", "NPC-Like"]
  
  all_MES_vs_AC[pathwayName] <<- Curr_MES_vs_AC
  all_NPC_vs_AC[pathwayName] <<- Curr_NPC_vs_AC
  all_OPC_vs_AC[pathwayName] <<- Curr_OPC_vs_AC
  all_NPC_vs_MES[pathwayName] <<- Curr_NPC_vs_MES
  all_OPC_vs_MES[pathwayName] <<- Curr_OPC_vs_MES
  all_OPC_vs_NPC[pathwayName] <<- Curr_OPC_vs_NPC
  
  # Calculating effect size of the pathway: differences between the median scores
  medianAC <- median(allCellsScores[allCellsScores$MetaModule == "AC-Like", "Score"])
  medianMES <- median(allCellsScores[allCellsScores$MetaModule == "MES-Like", "Score"])
  medianNPC <- median(allCellsScores[allCellsScores$MetaModule == "NPC-Like", "Score"])
  medianOPC <- median(allCellsScores[allCellsScores$MetaModule == "OPC-Like", "Score"])
  
  EffectSize_MES_vs_AC <- medianMES - medianAC
  EffectSize_NPC_vs_AC <- medianNPC - medianAC
  EffectSize_OPC_vs_AC <- medianOPC - medianAC
  EffectSize_NPC_vs_MES <- medianNPC - medianMES
  EffectSize_OPC_vs_MES <- medianOPC - medianMES
  EffectSize_OPC_vs_NPC <- medianOPC - medianNPC
  
  allEffectSize_MES_vs_AC[pathwayName] <<- EffectSize_MES_vs_AC
  allEffectSize_NPC_vs_AC[pathwayName] <<- EffectSize_NPC_vs_AC
  allEffectSize_OPC_vs_AC[pathwayName] <<- EffectSize_OPC_vs_AC
  allEffectSize_NPC_vs_MES[pathwayName] <<- EffectSize_NPC_vs_MES
  allEffectSize_OPC_vs_MES[pathwayName] <<- EffectSize_OPC_vs_MES
  allEffectSize_OPC_vs_NPC[pathwayName] <<- EffectSize_OPC_vs_NPC
  
  # Total effect size: the spread between the highest and the lowest group median
  allMedians <- c(medianNPC, medianMES, medianAC, medianOPC)
  effectSize <- max(allMedians) - min(allMedians)
  allEffectSizes[pathwayName] <<- effectSize
  
  violinPlot <- ggplot(allCellsScores, aes(x=MetaModule, y = Score, fill = MetaModule)) +
    ggtitle(paste0("UNADJUSTED (!) P values:\n\n", 
                   "MES vs AC: P < ", Curr_MES_vs_AC, "\n", "Effect Size MES - AC is: ", EffectSize_MES_vs_AC, "\n",
                   "NPC vs AC: P < ", Curr_NPC_vs_AC, "\n", "Effect Size NPC - AC is: ", EffectSize_NPC_vs_AC, "\n",
                   "OPC vs AC: P < ", Curr_OPC_vs_AC, "\n", "Effect Size OPC - AC is: ", EffectSize_OPC_vs_AC, "\n",
                   "NPC vs MES: P < ", Curr_NPC_vs_MES, "\n", "Effect Size NPC - MES is: ", EffectSize_NPC_vs_MES, "\n",
                   "OPC vs MES: P < ", Curr_OPC_vs_MES, "\n", "Effect Size OPC - MES is: ", EffectSize_OPC_vs_MES, "\n",
                   "OPC vs NPC: P < ", Curr_OPC_vs_NPC, "\n", "Effect Size OPC - NPC is: ", EffectSize_OPC_vs_NPC, "\n",
                   "Total effect size (Max median - Min median) is: ", effectSize)) +
    geom_violin(trim=FALSE) + geom_boxplot(width=0.1)
  
  # on.exit() closes the PDF device even when drawing the plot fails
  pdf(paste0("NormalizedCounts_", pathwayName, "_allPatientsCells.pdf"))
  on.exit(dev.off(), add = TRUE)
  print(violinPlot)
  
  invisible(list(
    pathway = pathwayName,
    pValues = c(MES_vs_AC = Curr_MES_vs_AC, NPC_vs_AC = Curr_NPC_vs_AC,
                OPC_vs_AC = Curr_OPC_vs_AC, NPC_vs_MES = Curr_NPC_vs_MES,
                OPC_vs_MES = Curr_OPC_vs_MES, OPC_vs_NPC = Curr_OPC_vs_NPC),
    effectSizes = c(MES_vs_AC = EffectSize_MES_vs_AC, NPC_vs_AC = EffectSize_NPC_vs_AC,
                    OPC_vs_AC = EffectSize_OPC_vs_AC, NPC_vs_MES = EffectSize_NPC_vs_MES,
                    OPC_vs_MES = EffectSize_OPC_vs_MES, OPC_vs_NPC = EffectSize_OPC_vs_NPC),
    totalEffectSize = effectSize))
}


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MAIN program starts here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Directory holding the input files; results are written there as well.
# When left empty, the current working directory is used (setwd("") would fail).
workingDirectory <- ""
if (nzchar(workingDirectory)) setwd(workingDirectory)

GMT_FILE_NAME <- "h.all.v7.0.symbols.pluscc.gmt"
genesets_name <- "hallmarks"
genesets <- getGmt(GMT_FILE_NAME)

# Assigning every cell to a meta module according to its coordinates
cellModules <- read.delim("IDHwt.GBM.Hierarchy.SS2.txt", header = TRUE, sep = "\t", dec = ".")
cellModules$MetaModule <- apply(cellModules, 1, getMetaModule)
allCellsScores <- dplyr::select(cellModules, NAME, MetaModule)
# read.table() below replaces '-' by '.' in the column (cell) names, so the same is done here
allCellsScores$NAME <- gsub('-', '.', allCellsScores$NAME)

allPatientsData <- as.matrix(read.table("IDHwtGBM.processed.SS2.logTPM.txt", row.names = 1, dec = ".", header = TRUE))
allGeneNames <- rownames(allPatientsData)

# excluding cells for which no meta-module assignment exists in the data
cellsWithModuleAssignment <- colnames(allPatientsData) %in% allCellsScores$NAME
allPatientsData <- allPatientsData[, cellsWithModuleAssignment]
filteredPatientsData <- filterData(dataMatrix = allPatientsData, isLogTPM = TRUE, convertToCPM = FALSE)

# Accumulators filled by executePathwayCalculations(), one entry per pathway
all_MES_vs_AC <- numeric()
all_NPC_vs_AC <- numeric()
all_OPC_vs_AC <- numeric()
all_NPC_vs_MES <- numeric()
all_OPC_vs_MES <- numeric()
all_OPC_vs_NPC <- numeric()

allEffectSizes <- numeric()

allEffectSize_MES_vs_AC <- numeric()
allEffectSize_NPC_vs_AC <- numeric()
allEffectSize_OPC_vs_AC <- numeric()
allEffectSize_NPC_vs_MES <- numeric()
allEffectSize_OPC_vs_MES <- numeric()
allEffectSize_OPC_vs_NPC <- numeric()

allPathwayScores <- numeric()

# The pathways are processed sequentially, in this R process: executePathwayCalculations()
# stores its results in the accumulators above with <<-, and such assignments made inside
# the forked workers of mclapply() never reach the parent process, which would leave all
# the accumulators empty. A pathway that fails is reported and skipped, so it does not
# abort the remaining pathways.
for (currPathway in genesets@.Data)
{
  tryCatch(
    executePathwayCalculations(currPathway, filteredPatientsData, allCellsScores),
    error = function(e)
    {
      warning("Pathway ", currPathway@setName, " failed: ", conditionMessage(e), call. = FALSE)
    })
}

# adjusting all P values 
adjusted_all_MES_vs_AC <- p.adjust(all_MES_vs_AC, method = "BH", n = length(all_MES_vs_AC))
adjusted_all_NPC_vs_AC <- p.adjust(all_NPC_vs_AC, method = "BH", n = length(all_NPC_vs_AC))
adjusted_all_OPC_vs_AC <- p.adjust(all_OPC_vs_AC, method = "BH", n = length(all_OPC_vs_AC))
adjusted_all_NPC_vs_MES <- p.adjust(all_NPC_vs_MES, method = "BH", n = length(all_NPC_vs_MES))
adjusted_all_OPC_vs_MES <- p.adjust(all_OPC_vs_MES, method = "BH", n = length(all_OPC_vs_MES))
adjusted_all_OPC_vs_NPC <- p.adjust(all_OPC_vs_NPC, method = "BH", n = length(all_OPC_vs_NPC))

write.csv2(adjusted_all_MES_vs_AC, file = "NormalizedCounts_adjusted_all_MES_vs_AC.csv")
write.csv2(adjusted_all_NPC_vs_AC, file = "NormalizedCounts_adjusted_all_NPC_vs_AC.csv")
write.csv2(adjusted_all_OPC_vs_AC, file = "NormalizedCounts_adjusted_all_OPC_vs_AC.csv")
write.csv2(adjusted_all_NPC_vs_MES, file = "NormalizedCounts_adjusted_all_NPC_vs_MES.csv")
write.csv2(adjusted_all_OPC_vs_MES, file = "NormalizedCounts_adjusted_all_OPC_vs_MES.csv")
write.csv2(adjusted_all_OPC_vs_NPC, file = "NormalizedCounts_adjusted_all_OPC_vs_NPC.csv")

write.csv2(allEffectSizes, file = "NormalizedCounts_allEffectSizes.csv")

write.csv2(allEffectSize_MES_vs_AC, file = "NormalizedCounts_MES_vs_AC_EffectSizes.csv")
write.csv2(allEffectSize_NPC_vs_AC, file = "NormalizedCounts_NPC_vs_AC_EffectSizes.csv")
write.csv2(allEffectSize_OPC_vs_AC, file = "NormalizedCounts_OPC_vs_AC_EffectSizes.csv")
write.csv2(allEffectSize_NPC_vs_MES, file = "NormalizedCounts_NPC_vs_MES_EffectSizes.csv")
write.csv2(allEffectSize_OPC_vs_MES, file = "NormalizedCounts_OPC_vs_MES_EffectSizes.csv")
write.csv2(allEffectSize_OPC_vs_NPC, file = "NormalizedCounts_OPC_vs_NPC_EffectSizes.csv")

saveRDS(object = allPathwayScores, file = "SiPSiC_GBM_All_Pathway_Scores.RDS")